{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2023-12-15T01:40:38.080248Z",
     "start_time": "2023-12-15T01:40:37.969459Z"
    }
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'numpy'",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mModuleNotFoundError\u001B[0m                       Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[1], line 2\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;66;03m# import\u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mnumpy\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mnp\u001B[39;00m\n\u001B[1;32m      3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdata\u001B[39;00m\n\u001B[1;32m      4\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mpandas\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mpd\u001B[39;00m\n",
      "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'numpy'"
     ]
    }
   ],
   "source": [
    "# import\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch.cuda\n",
    "import torch.utils.data\n",
    "from torch import nn\n",
    "from torch.optim.adam import Adam\n",
    "from torch.utils.data import DataLoader, random_split\n",
    "from tqdm import tqdm\n",
    "from transformers import BertTokenizer, BertModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# dataset\n",
    "class Dataset(torch.utils.data.Dataset):\n",
    "    def __init__(self, datas, model_path):\n",
    "        self.labels = datas['label']\n",
    "        tokenizer = BertTokenizer.from_pretrained(model_path)\n",
    "        self.reviews = [\n",
    "            tokenizer(str(review), padding='max_length', max_length=512, truncation=True, return_tensors='pt')\n",
    "            for review in datas['review']]\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.labels)\n",
    "\n",
    "    def __getitem__(self, item):\n",
    "        \"\"\"\n",
    "        默认情况下，DataLoader 的 collate_fn 使用 torch.utils.data._utils.collate.default_collate，\n",
    "        这个函数要求 batch 中的每个元素都是 PyTorch 的 tensor、numpy array、数字、字典或列表。\n",
    "        \"\"\"\n",
    "        return self.reviews[item], np.array(self.labels[item])"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "1b7b9a9e6ac089e5"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# model\n",
    "class BertClassifier(nn.Module):\n",
    "    def __init__(self, model_path, dropout=0.5):\n",
    "        super(BertClassifier, self).__init__()\n",
    "        self.bert = BertModel.from_pretrained(model_path)\n",
    "        self.dropout = nn.Dropout(dropout)\n",
    "        self.linear = nn.Linear(768, 2)\n",
    "        self.relu = nn.ReLU()\n",
    "\n",
    "    def forward(self, input_ids, attention_mask):\n",
    "        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)\n",
    "        dropout_output = self.dropout(pooled_output)\n",
    "        linear_output = self.linear(dropout_output)\n",
    "        final_layer = self.relu(linear_output)\n",
    "        return final_layer"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ad2b1d7c97c213b8"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# train\n",
    "def train(model, model_save_path, train_dataset, val_dataset, batch_size, lr, epochs):\n",
    "    # DataLoader根据batch_size获取数据，训练时选择打乱样本\n",
    "    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
    "    val_loader = DataLoader(val_dataset, batch_size=batch_size)\n",
    "    # 是否使用gpu\n",
    "    use_cuda = torch.cuda.is_available()\n",
    "    device = torch.device(\"cuda:0\" if use_cuda else \"cpu\")\n",
    "    # 定义损失函数和优化器\n",
    "    criterion = nn.CrossEntropyLoss()\n",
    "    optim = Adam(model.parameters(), lr=lr)\n",
    "\n",
    "    if use_cuda:\n",
    "        model = model.to(device)\n",
    "        criterion = criterion.to(device)\n",
    "\n",
    "    best_avg_acc_val = 0\n",
    "    for epoch in range(epochs):\n",
    "        # 训练集损失&准确率\n",
    "        total_loss_train = 0\n",
    "        total_acc_train = 0\n",
    "        # 训练进度\n",
    "        for train_input, train_label in tqdm(train_loader):\n",
    "            model.train()\n",
    "            train_label = train_label.to(device)\n",
    "            attention_mask = train_input['attention_mask'].to(device)\n",
    "            input_ids = train_input['input_ids'].squeeze(1).to(device)\n",
    "            # 模型输出\n",
    "            output = model(input_ids, attention_mask)\n",
    "            # 计算损失\n",
    "            loss = criterion(output, train_label)\n",
    "            total_loss_train += loss\n",
    "            # 计算准确率\n",
    "            acc = (output.argmax(dim=1) == train_label).sum().item()\n",
    "            total_acc_train += acc\n",
    "            # 模型更新\n",
    "            model.zero_grad()\n",
    "            loss.backward()\n",
    "            optim.step()\n",
    "\n",
    "        # 模型验证\n",
    "        total_loss_val = 0\n",
    "        total_acc_val = 0\n",
    "        # 验证无需梯度计算\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            # 使用当前epoch训练好的模型验证\n",
    "            for val_input, val_label in val_loader:\n",
    "                val_label = val_label.to(device)\n",
    "                attention_mask = val_input['attention_mask'].to(device)\n",
    "                input_ids = val_input['input_ids'].squeeze(1).to(device)\n",
    "                # 模型输出\n",
    "                output = model(input_ids, attention_mask)\n",
    "                loss = criterion(output, val_label)\n",
    "                total_loss_val += loss\n",
    "                acc = (output.argmax(dim=1) == val_label).sum().item()\n",
    "                total_acc_val += acc\n",
    "\n",
    "        # save model\n",
    "        if (total_acc_val / len(val_dataset)) > best_avg_acc_val:\n",
    "            best_avg_acc_val = total_acc_val / len(val_dataset)\n",
    "            torch.save(model.state_dict(), model_save_path)\n",
    "            print(f'''best model | Val Accuracy: {best_avg_acc_val: .3f}''')\n",
    "        print(\n",
    "            f'''Epochs: {epoch + 1} \n",
    "              | Train Loss: {total_loss_train / len(train_dataset): .3f} \n",
    "              | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \n",
    "              | Val Loss: {total_loss_val / len(val_dataset): .3f} \n",
    "              | Val Accuracy: {total_acc_val / len(val_dataset): .3f}''')\n",
    "\n",
    "\n",
    "def test(model, model_save_path, test_dataset, batch_size):\n",
    "    # 加载最佳模型权重\n",
    "    model.load_state_dict(torch.load(model_save_path))\n",
    "    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)\n",
    "    use_cuda = torch.cuda.is_available()\n",
    "    device = torch.device(\"cuda:0\" if use_cuda else \"cpu\")\n",
    "\n",
    "    if use_cuda:\n",
    "        model = model.to(device)\n",
    "\n",
    "    total_acc_test = 0\n",
    "    model.eval()\n",
    "    with torch.no_grad():\n",
    "        for test_input, test_label in test_dataloader:\n",
    "            test_label = test_label.to(device)\n",
    "            attention_mask = test_input['attention_mask'].to(device)\n",
    "            input_ids = test_input['input_ids'].squeeze(1).to(device)\n",
    "            output = model(input_ids, attention_mask)\n",
    "            acc = (output.argmax(dim=1) == test_label).sum().item()\n",
    "            total_acc_test += acc\n",
    "    print(f'Test Accuracy: {total_acc_test / len(test_dataset): .3f}')\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    batch_size = 24\n",
    "    learn_rate = 1e-4\n",
    "    epochs = 5\n",
    "    # 加载数据\n",
    "    label_datas = pd.read_excel('../train-datas/ChnSentiCorp_htl_all.xlsx')\n",
    "    # 初始化dataset\n",
    "    dateset = Dataset(label_datas, '../bert-base-chinese')\n",
    "    # 创建模型\n",
    "    model = BertClassifier('../bert-base-chinese')\n",
    "    # 分割数据集\n",
    "    total_size = len(label_datas)\n",
    "    train_size = int(0.8 * total_size)\n",
    "    val_size = int(0.1 * total_size)\n",
    "    test_size = total_size - train_size - val_size\n",
    "    # 分割数据集\n",
    "    train_dataset, val_dataset, test_dataset = random_split(dateset, [train_size, val_size, test_size])\n",
    "    print('train begin')\n",
    "    train(model, '../result-model/classifier-model.pkl', train_dataset, val_dataset, batch_size, learn_rate, 5)\n",
    "    print('train finish')\n",
    "    print('test begin')\n",
    "    test(model, '../result-model/classifier-model.pkl', test_dataset, batch_size)\n",
    "    print('test finish')"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "22a59acefd138c93"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
